pubchem.compound Schema Extraction¶
This notebook demonstrates RDF schema extraction from the pubchem.compound SPARQL endpoint. It discovers VoID (Vocabulary of Interlinked Datasets) descriptions and generates JSON-LD as the source for all downstream outputs including frequency analysis and LinkML schemas.
Exports¶
- JSON-LD Schema (primary output)
- N-Quads RDF
- VoID Graph for the dataset in its original source
- Coverage report
- LinkML Schema
- Full parquet entity dataframe
In [1]:
# Dataset Configuration
import os
# Dataset parameters
endpoint_url = "https://idsm.elixir-czech.cz/sparql/endpoint/idsm"
dataset_name = "pubchem.compound"
void_iri = "http://rdf.ncbi.nlm.nih.gov/pubchem/compound"
graph_uri = "http://rdf.ncbi.nlm.nih.gov/pubchem/compound"
# Setup paths
working_path = os.path.abspath("")
exports_path = os.path.join(
working_path, "..", "..", "docs", "data", "schema_extraction", dataset_name
)
os.makedirs(exports_path, exist_ok=True)
In [2]:
import logging
import sys
# Minimal notebook logger using existing dataset_name
logger = logging.getLogger(dataset_name or "notebook")
logger.setLevel(logging.DEBUG) # Set to DEBUG to see SPARQL queries
# Also configure the rdfsolve.parser logger to see query details
parser_logger = logging.getLogger("rdfsolve.parser")
parser_logger.setLevel(logging.DEBUG)
# Avoid adding duplicate handlers if the cell is re-run
if not logger.handlers:
fmt = logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s", "%Y-%m-%d %H:%M:%S")
sh = logging.StreamHandler(sys.stdout)
sh.setLevel(logging.DEBUG) # Set to DEBUG to see all logs
sh.setFormatter(fmt)
logger.addHandler(sh)
# Add the same handler to the parser logger
parser_logger.addHandler(sh)
logger.info(f"Logging configured for {dataset_name}")
2025-12-02 10:21:22 INFO pubchem.compound: Logging configured for pubchem.compound
In [3]:
# Import libraries
import json
# Configure Plotly for HTML output
import plotly.io as pio
import plotly.offline as pyo
from IPython.display import Markdown, display
# Import rdfsolve API functions
from rdfsolve.api import (
discover_void_graphs,
generate_void_from_endpoint,
load_parser_from_graph,
retrieve_void_from_graphs,
)
from rdfsolve.sparql_helper import SparqlHelper
# Enable query collection to track all SPARQL queries executed
SparqlHelper.enable_query_collection()
# Set renderer to 'notebook' for Jupyter, but ensure HTML export works
pio.renderers.default = "notebook+plotly_mimetype"
# Initialize offline mode for Plotly
pyo.init_notebook_mode(connected=True)
In [4]:
# Pickle caching utilities
import os
import pickle
def save_cache(data, filename, cache_dir=None):
"""Save data to pickle cache."""
if cache_dir is None:
cache_dir = os.path.join(exports_path, "cache")
os.makedirs(cache_dir, exist_ok=True)
cache_path = os.path.join(cache_dir, f"{filename}.pkl")
with open(cache_path, "wb") as f:
pickle.dump(data, f)
print(f"Cached data to: {cache_path}")
return cache_path
def load_cache(filename, cache_dir=None):
"""Load data from pickle cache if it exists."""
if cache_dir is None:
cache_dir = os.path.join(exports_path, "cache")
cache_path = os.path.join(cache_dir, f"{filename}.pkl")
if os.path.exists(cache_path):
with open(cache_path, "rb") as f:
data = pickle.load(f)
print(f"Loaded cached data from: {cache_path}")
return data
return None
def cache_exists(filename, cache_dir=None):
"""Check if cache file exists."""
if cache_dir is None:
cache_dir = os.path.join(exports_path, "cache")
cache_path = os.path.join(cache_dir, f"{filename}.pkl")
return os.path.exists(cache_path)
In [5]:
# Cache management utilities
def list_cache_files(cache_dir=None):
"""List all cache files."""
if cache_dir is None:
cache_dir = os.path.join(exports_path, "cache")
if not os.path.exists(cache_dir):
print("No cache directory found")
return []
cache_files = [f for f in os.listdir(cache_dir) if f.endswith(".pkl")]
print(f"Cache directory: {cache_dir}")
for f in cache_files:
file_path = os.path.join(cache_dir, f)
size_mb = os.path.getsize(file_path) / (1024 * 1024)
print(f" {f} ({size_mb:.2f} MB)")
return cache_files
def clear_cache(filename=None, cache_dir=None):
"""Clear specific cache file or all cache."""
if cache_dir is None:
cache_dir = os.path.join(exports_path, "cache")
if filename:
cache_path = os.path.join(cache_dir, f"{filename}.pkl")
if os.path.exists(cache_path):
os.remove(cache_path)
print(f"Removed cache: {filename}")
else:
print(f"Cache not found: {filename}")
else:
# Clear all cache files
if os.path.exists(cache_dir):
import shutil
shutil.rmtree(cache_dir)
print("Cleared all cache files")
else:
print("No cache directory to clear")
# Show current cache status
list_cache_files()
No cache directory found
Out[5]:
[]
Cache Control¶
Use these cells to manage cached data. When testing new code changes, you may want to clear relevant cache files to force re-computation.
In [6]:
# Clear specific cache files (uncomment lines as needed for testing)
# When testing new VoID discovery/generation:
# clear_cache(f"{dataset_name}_voidgraph")
# When testing JSON-LD generation (primary output):
# clear_cache(f"{dataset_name}_jsonld_schema")
# When testing frequency calculations:
# clear_cache(f"{dataset_name}_frequencies_basic")
# clear_cache(f"{dataset_name}_frequencies_with_instances")
# Clear everything:
clear_cache()
print("Cache control ready")
print("Note: VoID graph and JSON-LD are the primary caches")
No cache directory to clear Cache control ready Note: VoID graph and JSON-LD are the primary caches
Discover or get VoID Schema¶
In [7]:
# Discover or generate VoID schema with caching
cache_key = f"{dataset_name}_voidgraph"
# Try to load from cache first
void_graph = load_cache(cache_key)
if void_graph is None:
print("VoID graph not found in cache, attempting discovery...")
# Step 1: Try to discover existing VoID graphs
discovery_result = discover_void_graphs(
endpoint_url, graph_uris=[graph_uri] if graph_uri else None
)
found_graphs = discovery_result.get("found_graphs", [])
partitions = discovery_result.get("partitions", [])
if found_graphs and partitions:
print(f"Found {len(found_graphs)} VoID graphs with {len(partitions)} partitions")
# Build VoID graph directly from partition data (no CONSTRUCT query needed)
void_graph = retrieve_void_from_graphs(
endpoint_url,
found_graphs,
graph_uris=[graph_uri] if graph_uri else None,
partitions=partitions, # Pass partition data directly
)
# Save to file
void_path = os.path.join(exports_path, f"{dataset_name}_existing_void.ttl")
void_graph.serialize(destination=void_path, format="turtle")
print(f"Built VoID graph from: {', '.join(found_graphs)}")
else:
print("No VoID graphs found, generating from queries...")
# Step 2: Generate new VoID if none found
void_graph = generate_void_from_endpoint(
endpoint_url=endpoint_url,
graph_uris=[graph_uri] if graph_uri else None,
output_file=os.path.join(exports_path, f"{dataset_name}_generated_void.ttl"),
counts=True,
offset_limit_steps=300,
exclude_graphs=True,
)
# Cache the VoID graph for future use
save_cache(void_graph, cache_key)
print(f"VoID graph cached with {len(void_graph)} triples")
else:
print(f"Loaded VoID graph from cache ({len(void_graph)} triples)")
# Load parser from the VoID graph
vp = load_parser_from_graph(void_graph, graph_uris=[graph_uri] if graph_uri else None)
VoID graph not found in cache, attempting discovery... 2025-12-02 10:21:23 DEBUG rdfsolve.parser: Starting VoID partition discovery for https://idsm.elixir-czech.cz/sparql/endpoint/idsm
2025-12-02 10:21:23 INFO rdfsolve.parser: Discovering VoID partitions across all graphs
Query attempt 1/3 failed: 500 Server Error: 500 for url: https://idsm.elixir-czech.cz/sparql/endpoint/idsm?query=%0A++++++++PREFIX+void%3A+%3Chttp%3A%2F%2Frdfs.org%2Fns%2Fvoid%23%3E%0A++++++++PREFIX+void-ext%3A+%3Chttp%3A%2F%2Fldf.fi%2Fvoid-ext%23%3E%0A++++++++SELECT+DISTINCT+%3FsubjectClass+%3Fprop+%3FobjectClass+%3FobjectDatatype+%3Fg%0A++++++++WHERE+%7B%0A++++++++++GRAPH+%3Fg+%7B%0A++++++++++++%7B%0A++++++++++++++%3Fcp+void%3Aclass+%3FsubjectClass+%3B%0A++++++++++++++++++void%3ApropertyPartition+%3Fpp+.%0A++++++++++++++%3Fpp+void%3Aproperty+%3Fprop+.%0A++++++++++++++OPTIONAL+%7B%0A++++++++++++++++++%7B%0A++++++++++++++++++++++%3Fpp++void%3AclassPartition+%5B+void%3Aclass+%3FobjectClass+%5D+.%0A++++++++++++++++++%7D+UNION+%7B%0A++++++++++++++++++++++%3Fpp+void-ext%3AdatatypePartition+%5B+void-ext%3Adatatype+%3FobjectDatatype+%5D+.%0A++++++++++++++++++%7D%0A++++++++++++++%7D%0A++++++++++++%7D+UNION+%7B%0A++++++++++++++%3Fls+void%3AsubjectsTarget+%5B+void%3Aclass+%3FsubjectClass+%5D+%3B%0A++++++++++++++++++void%3AlinkPredicate+%3Fprop+%3B%0A++++++++++++++++++void%3AobjectsTarget+%5B+void%3Aclass+%3FobjectClass+%5D+.%0A++++++++++++%7D%0A++++++++++%7D%0A++++++++%7D%0A++++++++
Query attempt 2/3 failed: 500 Server Error: 500 for url: https://idsm.elixir-czech.cz/sparql/endpoint/idsm?query=%0A++++++++PREFIX+void%3A+%3Chttp%3A%2F%2Frdfs.org%2Fns%2Fvoid%23%3E%0A++++++++PREFIX+void-ext%3A+%3Chttp%3A%2F%2Fldf.fi%2Fvoid-ext%23%3E%0A++++++++SELECT+DISTINCT+%3FsubjectClass+%3Fprop+%3FobjectClass+%3FobjectDatatype+%3Fg%0A++++++++WHERE+%7B%0A++++++++++GRAPH+%3Fg+%7B%0A++++++++++++%7B%0A++++++++++++++%3Fcp+void%3Aclass+%3FsubjectClass+%3B%0A++++++++++++++++++void%3ApropertyPartition+%3Fpp+.%0A++++++++++++++%3Fpp+void%3Aproperty+%3Fprop+.%0A++++++++++++++OPTIONAL+%7B%0A++++++++++++++++++%7B%0A++++++++++++++++++++++%3Fpp++void%3AclassPartition+%5B+void%3Aclass+%3FobjectClass+%5D+.%0A++++++++++++++++++%7D+UNION+%7B%0A++++++++++++++++++++++%3Fpp+void-ext%3AdatatypePartition+%5B+void-ext%3Adatatype+%3FobjectDatatype+%5D+.%0A++++++++++++++++++%7D%0A++++++++++++++%7D%0A++++++++++++%7D+UNION+%7B%0A++++++++++++++%3Fls+void%3AsubjectsTarget+%5B+void%3Aclass+%3FsubjectClass+%5D+%3B%0A++++++++++++++++++void%3AlinkPredicate+%3Fprop+%3B%0A++++++++++++++++++void%3AobjectsTarget+%5B+void%3Aclass+%3FobjectClass+%5D+.%0A++++++++++++%7D%0A++++++++++%7D%0A++++++++%7D%0A++++++++
Query attempt 3/3 failed: 500 Server Error: 500 for url: https://idsm.elixir-czech.cz/sparql/endpoint/idsm?query=%0A++++++++PREFIX+void%3A+%3Chttp%3A%2F%2Frdfs.org%2Fns%2Fvoid%23%3E%0A++++++++PREFIX+void-ext%3A+%3Chttp%3A%2F%2Fldf.fi%2Fvoid-ext%23%3E%0A++++++++SELECT+DISTINCT+%3FsubjectClass+%3Fprop+%3FobjectClass+%3FobjectDatatype+%3Fg%0A++++++++WHERE+%7B%0A++++++++++GRAPH+%3Fg+%7B%0A++++++++++++%7B%0A++++++++++++++%3Fcp+void%3Aclass+%3FsubjectClass+%3B%0A++++++++++++++++++void%3ApropertyPartition+%3Fpp+.%0A++++++++++++++%3Fpp+void%3Aproperty+%3Fprop+.%0A++++++++++++++OPTIONAL+%7B%0A++++++++++++++++++%7B%0A++++++++++++++++++++++%3Fpp++void%3AclassPartition+%5B+void%3Aclass+%3FobjectClass+%5D+.%0A++++++++++++++++++%7D+UNION+%7B%0A++++++++++++++++++++++%3Fpp+void-ext%3AdatatypePartition+%5B+void-ext%3Adatatype+%3FobjectDatatype+%5D+.%0A++++++++++++++++++%7D%0A++++++++++++++%7D%0A++++++++++++%7D+UNION+%7B%0A++++++++++++++%3Fls+void%3AsubjectsTarget+%5B+void%3Aclass+%3FsubjectClass+%5D+%3B%0A++++++++++++++++++void%3AlinkPredicate+%3Fprop+%3B%0A++++++++++++++++++void%3AobjectsTarget+%5B+void%3Aclass+%3FobjectClass+%5D+.%0A++++++++++++%7D%0A++++++++++%7D%0A++++++++%7D%0A++++++++
SELECT failed after 3 tries
2025-12-02 10:21:28 INFO rdfsolve.parser: VoID discovery failed: Query failed after 3 attempts: 500 Server Error: 500 for url: https://idsm.elixir-czech.cz/sparql/endpoint/idsm?query=%0A++++++++PREFIX+void%3A+%3Chttp%3A%2F%2Frdfs.org%2Fns%2Fvoid%23%3E%0A++++++++PREFIX+void-ext%3A+%3Chttp%3A%2F%2Fldf.fi%2Fvoid-ext%23%3E%0A++++++++SELECT+DISTINCT+%3FsubjectClass+%3Fprop+%3FobjectClass+%3FobjectDatatype+%3Fg%0A++++++++WHERE+%7B%0A++++++++++GRAPH+%3Fg+%7B%0A++++++++++++%7B%0A++++++++++++++%3Fcp+void%3Aclass+%3FsubjectClass+%3B%0A++++++++++++++++++void%3ApropertyPartition+%3Fpp+.%0A++++++++++++++%3Fpp+void%3Aproperty+%3Fprop+.%0A++++++++++++++OPTIONAL+%7B%0A++++++++++++++++++%7B%0A++++++++++++++++++++++%3Fpp++void%3AclassPartition+%5B+void%3Aclass+%3FobjectClass+%5D+.%0A++++++++++++++++++%7D+UNION+%7B%0A++++++++++++++++++++++%3Fpp+void-ext%3AdatatypePartition+%5B+void-ext%3Adatatype+%3FobjectDatatype+%5D+.%0A++++++++++++++++++%7D%0A++++++++++++++%7D%0A++++++++++++%7D+UNION+%7B%0A++++++++++++++%3Fls+void%3AsubjectsTarget+%5B+void%3Aclass+%3FsubjectClass+%5D+%3B%0A++++++++++++++++++void%3AlinkPredicate+%3Fprop+%3B%0A++++++++++++++++++void%3AobjectsTarget+%5B+void%3Aclass+%3FobjectClass+%5D+.%0A++++++++++++%7D%0A++++++++++%7D%0A++++++++%7D%0A++++++++
2025-12-02 10:21:28 DEBUG rdfsolve.parser: Discovery exception: EndpointError: Query failed after 3 attempts: 500 Server Error: 500 for url: https://idsm.elixir-czech.cz/sparql/endpoint/idsm?query=%0A++++++++PREFIX+void%3A+%3Chttp%3A%2F%2Frdfs.org%2Fns%2Fvoid%23%3E%0A++++++++PREFIX+void-ext%3A+%3Chttp%3A%2F%2Fldf.fi%2Fvoid-ext%23%3E%0A++++++++SELECT+DISTINCT+%3FsubjectClass+%3Fprop+%3FobjectClass+%3FobjectDatatype+%3Fg%0A++++++++WHERE+%7B%0A++++++++++GRAPH+%3Fg+%7B%0A++++++++++++%7B%0A++++++++++++++%3Fcp+void%3Aclass+%3FsubjectClass+%3B%0A++++++++++++++++++void%3ApropertyPartition+%3Fpp+.%0A++++++++++++++%3Fpp+void%3Aproperty+%3Fprop+.%0A++++++++++++++OPTIONAL+%7B%0A++++++++++++++++++%7B%0A++++++++++++++++++++++%3Fpp++void%3AclassPartition+%5B+void%3Aclass+%3FobjectClass+%5D+.%0A++++++++++++++++++%7D+UNION+%7B%0A++++++++++++++++++++++%3Fpp+void-ext%3AdatatypePartition+%5B+void-ext%3Adatatype+%3FobjectDatatype+%5D+.%0A++++++++++++++++++%7D%0A++++++++++++++%7D%0A++++++++++++%7D+UNION+%7B%0A++++++++++++++%3Fls+void%3AsubjectsTarget+%5B+void%3Aclass+%3FsubjectClass+%5D+%3B%0A++++++++++++++++++void%3AlinkPredicate+%3Fprop+%3B%0A++++++++++++++++++void%3AobjectsTarget+%5B+void%3Aclass+%3FobjectClass+%5D+.%0A++++++++++++%7D%0A++++++++++%7D%0A++++++++%7D%0A++++++++
No VoID graphs found, generating from queries...
2025-12-02 10:28:33 WARNING rdfsolve.parser: Query attempt 1 failed: Remote end closed connection without response
2025-12-02 10:28:33 DEBUG rdfsolve.parser: Query:
PREFIX void: <http://rdfs.org/ns/void#>
PREFIX void-ext: <http://ldf.fi/void-ext#>
CONSTRUCT {
?dp void-ext:datatypePartition ?datatype ;
void:triples ?count .
}
WHERE {
{
SELECT ?datatype (COUNT(?s) AS ?count)
WHERE {
GRAPH <http://rdf.ncbi.nlm.nih.gov/pubchem/compound> {
?s ?p ?o .
FILTER(isLiteral(?o))
BIND(datatype(?o) AS ?datatype)
}
}
GROUP BY ?datatype
}
BIND(IRI(CONCAT('http://rdf.ncbi.nlm.nih.gov/pubchem/compound/void/datatype_partition_',
REPLACE(STR(?datatype), '[^a-zA-Z0-9_]', '_', 'g'))) AS ?dp)
}
2025-12-02 10:28:33 INFO rdfsolve.parser: Retrying after 1.1s (attempt 2/4)
2025-12-02 10:33:20 WARNING rdfsolve.parser: Query attempt 2 failed: Remote end closed connection without response
2025-12-02 10:33:20 DEBUG rdfsolve.parser: Query:
PREFIX void: <http://rdfs.org/ns/void#>
PREFIX void-ext: <http://ldf.fi/void-ext#>
CONSTRUCT {
?dp void-ext:datatypePartition ?datatype ;
void:triples ?count .
}
WHERE {
{
SELECT ?datatype (COUNT(?s) AS ?count)
WHERE {
GRAPH <http://rdf.ncbi.nlm.nih.gov/pubchem/compound> {
?s ?p ?o .
FILTER(isLiteral(?o))
BIND(datatype(?o) AS ?datatype)
}
}
GROUP BY ?datatype
}
BIND(IRI(CONCAT('http://rdf.ncbi.nlm.nih.gov/pubchem/compound/void/datatype_partition_',
REPLACE(STR(?datatype), '[^a-zA-Z0-9_]', '_', 'g'))) AS ?dp)
}
2025-12-02 10:33:20 INFO rdfsolve.parser: Retrying after 2.0s (attempt 3/4)
2025-12-02 10:38:07 WARNING rdfsolve.parser: Query attempt 3 failed: Remote end closed connection without response
2025-12-02 10:38:07 DEBUG rdfsolve.parser: Query:
PREFIX void: <http://rdfs.org/ns/void#>
PREFIX void-ext: <http://ldf.fi/void-ext#>
CONSTRUCT {
?dp void-ext:datatypePartition ?datatype ;
void:triples ?count .
}
WHERE {
{
SELECT ?datatype (COUNT(?s) AS ?count)
WHERE {
GRAPH <http://rdf.ncbi.nlm.nih.gov/pubchem/compound> {
?s ?p ?o .
FILTER(isLiteral(?o))
BIND(datatype(?o) AS ?datatype)
}
}
GROUP BY ?datatype
}
BIND(IRI(CONCAT('http://rdf.ncbi.nlm.nih.gov/pubchem/compound/void/datatype_partition_',
REPLACE(STR(?datatype), '[^a-zA-Z0-9_]', '_', 'g'))) AS ?dp)
}
2025-12-02 10:38:07 INFO rdfsolve.parser: Retrying after 4.0s (attempt 4/4)
2025-12-02 10:42:57 WARNING rdfsolve.parser: Query attempt 4 failed: Remote end closed connection without response
2025-12-02 10:42:57 DEBUG rdfsolve.parser: Query:
PREFIX void: <http://rdfs.org/ns/void#>
PREFIX void-ext: <http://ldf.fi/void-ext#>
CONSTRUCT {
?dp void-ext:datatypePartition ?datatype ;
void:triples ?count .
}
WHERE {
{
SELECT ?datatype (COUNT(?s) AS ?count)
WHERE {
GRAPH <http://rdf.ncbi.nlm.nih.gov/pubchem/compound> {
?s ?p ?o .
FILTER(isLiteral(?o))
BIND(datatype(?o) AS ?datatype)
}
}
GROUP BY ?datatype
}
BIND(IRI(CONCAT('http://rdf.ncbi.nlm.nih.gov/pubchem/compound/void/datatype_partition_',
REPLACE(STR(?datatype), '[^a-zA-Z0-9_]', '_', 'g'))) AS ?dp)
}
2025-12-02 10:42:57 ERROR rdfsolve.parser: Query failed after 4 attempts
2025-12-02 10:42:57 WARNING rdfsolve.parser: Query datatype_partitions failed after 1148.70s: Remote end closed connection without response
2025-12-02 10:42:57 INFO rdfsolve.parser: Successfully extracted 651 RDF triples
2025-12-02 10:42:57 INFO rdfsolve.parser: VoID description saved to /home/runner/work/rdfsolve/rdfsolve/notebooks/01_schema_extraction/../../docs/data/schema_extraction/pubchem.compound/pubchem.compound_generated_void.ttl
Cached data to: /home/runner/work/rdfsolve/rdfsolve/notebooks/01_schema_extraction/../../docs/data/schema_extraction/pubchem.compound/cache/pubchem.compound_voidgraph.pkl VoID graph cached with 651 triples
Schema Discovery and Exports Workflow¶
Workflow Steps:¶
- VoID Discovery: Extract schema patterns from SPARQL endpoint VoID descriptions
- JSON-LD Generation: Convert to JSON-LD.
- Derived Outputs: All other formats are generated from the JSON-LD structure:
- Frequencies: Schema pattern coverage analysis
- LinkML: LinkML YAML used elsewhere for other features.
- CSV/JSON: Tabular and structured data exports
- RDF: N-Quads serialization for triplestore import
In [8]:
# Primary JSON-LD schema export and basic summary
cache_key = f"{dataset_name}_jsonld_schema"
jsonld_schema = load_cache(cache_key)
if jsonld_schema is None:
print("Generating JSON-LD schema...")
jsonld_schema = vp.to_jsonld(filter_void_admin_nodes=True)
save_cache(jsonld_schema, cache_key)
else:
print("Loaded JSON-LD schema from cache")
# Save JSON-LD schema file
jsonld_file = os.path.join(exports_path, f"{dataset_name}_schema.jsonld")
with open(jsonld_file, "w", encoding="utf-8") as f:
json.dump(jsonld_schema, f, indent=2, ensure_ascii=False)
print(f"JSON-LD Schema saved to: {jsonld_file}")
# Display combined JSON-LD structure info and schema summary
if "@graph" in jsonld_schema:
print("\nSchema Summary:")
print(f" • Prefixes: {len(jsonld_schema['@context'])}")
print(f" • Resources: {len(jsonld_schema['@graph'])}")
# Show dataset metadata
dataset_info = jsonld_schema["@graph"][0] if jsonld_schema["@graph"] else {}
if dataset_info.get("@type") == "void:Dataset":
print(f" • Dataset: {dataset_info.get('dcterms:title', 'Unknown')}")
print(f" • Classes: {dataset_info.get('void:classes', 0)}")
print(f" • Properties: {dataset_info.get('void:properties', 0)}")
print(f" • Triples: {dataset_info.get('void:triples', 0)}")
# Get schema DataFrame and show sample
schema_df = vp.to_schema(filter_void_admin_nodes=True)
print(f"\nSchema Patterns Preview ({len(schema_df)} total):")
display(schema_df.head())
Generating JSON-LD schema...
Cached data to: /home/runner/work/rdfsolve/rdfsolve/notebooks/01_schema_extraction/../../docs/data/schema_extraction/pubchem.compound/cache/pubchem.compound_jsonld_schema.pkl JSON-LD Schema saved to: /home/runner/work/rdfsolve/rdfsolve/notebooks/01_schema_extraction/../../docs/data/schema_extraction/pubchem.compound/pubchem.compound_schema.jsonld Schema Summary: • Prefixes: 7 • Resources: 144
Schema Patterns Preview (24624 total):
| subject_class | subject_uri | property | property_uri | object_class | object_uri | |
|---|---|---|---|---|---|---|
| 0 | vocabulary:Compound | http://rdf.ncbi.nlm.nih.gov/pubchem/vocabulary... | cheminf:000455 | http://semanticscience.org/resource/CHEMINF_00... | ncit:C197 | http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus... |
| 1 | vocabulary:Compound | http://rdf.ncbi.nlm.nih.gov/pubchem/vocabulary... | cheminf:000455 | http://semanticscience.org/resource/CHEMINF_00... | ncit:C609 | http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus... |
| 2 | vocabulary:Compound | http://rdf.ncbi.nlm.nih.gov/pubchem/vocabulary... | cheminf:000455 | http://semanticscience.org/resource/CHEMINF_00... | vocabulary:Compound | http://rdf.ncbi.nlm.nih.gov/pubchem/vocabulary... |
| 3 | vocabulary:Compound | http://rdf.ncbi.nlm.nih.gov/pubchem/vocabulary... | cheminf:000455 | http://semanticscience.org/resource/CHEMINF_00... | ncit:C826 | http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus... |
| 4 | vocabulary:Compound | http://rdf.ncbi.nlm.nih.gov/pubchem/vocabulary... | cheminf:000455 | http://semanticscience.org/resource/CHEMINF_00... | ncit:C1228 | http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus... |
Schema Pattern Coverage Analysis¶
Calculate coverage ratios showing what percentage of entities use each relationship pattern.
In [9]:
# Schema pattern coverage analysis and export
cache_key = f"{dataset_name}_frequencies_basic"
cached_data = load_cache(cache_key)
if cached_data is None:
print("Calculating schema pattern frequencies...")
frequencies_df, _ = vp.count_schema_shape_frequencies(
endpoint_url=endpoint_url,
offset_limit_steps=300,
)
save_cache(frequencies_df, cache_key)
else:
print("Loaded frequencies DataFrame from cache")
frequencies_df = cached_data
# Export coverage analysis
frequencies_output_path = os.path.join(exports_path, f"{dataset_name}_pattern_coverage.csv")
exported_df = vp.export_schema_shape_frequencies(
frequencies_df, output_file=frequencies_output_path
)
# Combined summary and sample
if not frequencies_df.empty:
avg_coverage = frequencies_df["coverage_percent"].mean()
high_coverage = (frequencies_df["coverage_percent"] > 50).sum()
print("\nPattern Coverage Analysis:")
print(f" • Total patterns: {len(frequencies_df)}")
print(f" • Average coverage: {avg_coverage:.1f}%")
print(f" • High coverage (>50%): {high_coverage}")
print(f" • Exported to: {frequencies_output_path}")
print("\nSample Coverage Data:")
display(
frequencies_df[["subject_class", "property", "object_class", "coverage_percent"]].head()
)
print("\nCoverage Statistics:")
display(frequencies_df["coverage_percent"].describe())
else:
print("No frequency data available")
Calculating schema pattern frequencies...
2025-12-02 10:43:09 INFO rdfsolve.parser: Using chunked pagination for entity counts (step size: 300)
INFO:rdfsolve.parser:Using chunked pagination for entity counts (step size: 300)
WARNING:rdfsolve.sparql_helper:Query attempt 1/3 failed: HTTPSConnectionPool(host='idsm.elixir-czech.cz', port=443): Read timed out. (read timeout=60.0)
WARNING:rdfsolve.sparql_helper:Query attempt 2/3 failed: HTTPSConnectionPool(host='idsm.elixir-czech.cz', port=443): Read timed out. (read timeout=60.0)
WARNING:rdfsolve.sparql_helper:Chunk query failed at offset 0: HTTP 400: 400 Client Error: 400 for url: https://idsm.elixir-czech.cz/sparql/endpoint/idsm?query=%0A++++++++SELECT+%3Fclass+%28COUNT%28DISTINCT+%3Fs%29+AS+%3Ftotal%29+WHERE+%7B%0A++++++++++++GRAPH+%3Chttp%3A%2F%2Frdf.ncbi.nlm.nih.gov%2Fpubchem%2Fcompound%3E+%7B%0A++++++++++++++++%3Fs+a+%3Fclass+.%0A++++++++++++%7D%0A++++++++%7D%0A++++++++GROUP+BY+%3Fclass%0A++++++++ORDER+BY+DESC%28%3Ftotal%29%0A++++++++%0AOFFSET+0%0ALIMIT+300
2025-12-02 10:45:16 INFO rdfsolve.parser: Chunked entity counting complete: 0 chunks, 0 total results
INFO:rdfsolve.parser:Chunked entity counting complete: 0 chunks, 0 total results
Cached data to: /home/runner/work/rdfsolve/rdfsolve/notebooks/01_schema_extraction/../../docs/data/schema_extraction/pubchem.compound/cache/pubchem.compound_frequencies_basic.pkl Pattern Coverage Analysis: • Total patterns: 24624 • Average coverage: 0.0% • High coverage (>50%): 0 • Exported to: /home/runner/work/rdfsolve/rdfsolve/notebooks/01_schema_extraction/../../docs/data/schema_extraction/pubchem.compound/pubchem.compound_pattern_coverage.csv Sample Coverage Data:
| subject_class | property | object_class | coverage_percent | |
|---|---|---|---|---|
| 24607 | snomedct:442753007 | cheminf:000455 | snomedct:96230006 | 0.0 |
| 24606 | snomedct:442753007 | cheminf:000455 | snomedct:96096006 | 0.0 |
| 24605 | snomedct:442753007 | cheminf:000455 | snomedct:75362008 | 0.0 |
| 24604 | snomedct:442753007 | cheminf:000455 | snomedct:64011005 | 0.0 |
| 24603 | snomedct:442753007 | cheminf:000455 | snomedct:61025005 | 0.0 |
Coverage Statistics:
count 24624.0 mean 0.0 std 0.0 min 0.0 25% 0.0 50% 0.0 75% 0.0 max 0.0 Name: coverage_percent, dtype: float64
Schema Pattern Instance Collection¶
Collect actual subject and object IRI instances for each schema pattern. This provides detailed access to the specific entities participating in each relationship pattern.
In [10]:
# Collect both frequency data and actual instances with caching
cache_key = f"{dataset_name}_frequencies_with_instances"
cached_data = load_cache(cache_key)
if cached_data is None:
print("Collecting frequency data and instances...")
frequencies_with_instances_df, instances_df = vp.count_schema_shape_frequencies(
endpoint_url=endpoint_url,
# sample_limit=100, # Limited sample for demonstration
collect_instances=True,
offset_limit_steps=300,
)
# Cache both DataFrames as a tuple
save_cache((frequencies_with_instances_df, instances_df), cache_key)
else:
print("Loaded frequencies and instances DataFrames from cache")
frequencies_with_instances_df, instances_df = cached_data
# Display basic information about the data structure
print(f"Frequencies DataFrame: {len(frequencies_with_instances_df)} shapes")
if frequencies_with_instances_df is not None:
print(
f"Memory usage - Instances: {frequencies_with_instances_df.memory_usage(deep=True).sum() / 1024:.1f} KB"
)
else:
print("No instances collected")
Collecting frequency data and instances...
2025-12-02 10:45:21 INFO rdfsolve.parser: Using chunked pagination for entity counts (step size: 300)
INFO:rdfsolve.parser:Using chunked pagination for entity counts (step size: 300)
WARNING:rdfsolve.sparql_helper:Chunk query failed at offset 0: HTTP 400: 400 Client Error: 400 for url: https://idsm.elixir-czech.cz/sparql/endpoint/idsm?query=%0A++++++++SELECT+%3Fclass+%28COUNT%28DISTINCT+%3Fs%29+AS+%3Ftotal%29+WHERE+%7B%0A++++++++++++GRAPH+%3Chttp%3A%2F%2Frdf.ncbi.nlm.nih.gov%2Fpubchem%2Fcompound%3E+%7B%0A++++++++++++++++%3Fs+a+%3Fclass+.%0A++++++++++++%7D%0A++++++++%7D%0A++++++++GROUP+BY+%3Fclass%0A++++++++ORDER+BY+DESC%28%3Ftotal%29%0A++++++++%0AOFFSET+0%0ALIMIT+300
2025-12-02 10:45:26 INFO rdfsolve.parser: Chunked entity counting complete: 0 chunks, 0 total results
INFO:rdfsolve.parser:Chunked entity counting complete: 0 chunks, 0 total results
Cached data to: /home/runner/work/rdfsolve/rdfsolve/notebooks/01_schema_extraction/../../docs/data/schema_extraction/pubchem.compound/cache/pubchem.compound_frequencies_with_instances.pkl Frequencies DataFrame: 24624 shapes Memory usage - Instances: 17825.7 KB
In [11]:
import pandas as pd
import plotly.graph_objects as go
if not frequencies_with_instances_df.empty:
df = frequencies_with_instances_df.copy()
df["coverage_percent"] = pd.to_numeric(df["coverage_percent"], errors="coerce").fillna(0)
df = df.sort_values("coverage_percent", ascending=False).reset_index(drop=True)
def make_label(row):
return (
f"<b>{row['subject_class']}</b> "
f"<span style='color:#888;'></span> "
f"<i>{row['property']}</i> "
f"<span style='color:#888;'></span> "
f"<b>{row['object_class']}</b>"
)
df["styled_label"] = df.apply(make_label, axis=1)
text_positions = ["outside" if v < 95 else "inside" for v in df["coverage_percent"]]
custom_colorscale = [
[0.0, "#d36e61"],
[0.4, "#e5cdbd"],
[0.7, "#e8e4cf"],
[1.0, "#c3d9c0"],
]
# Figure sizing
bar_height = 26
fig_height = min(2000, bar_height * len(df) + 200)
fig = go.Figure(
go.Bar(
x=df["coverage_percent"],
y=df["styled_label"],
orientation="h",
text=[f"{v:.1f}%" for v in df["coverage_percent"]],
textposition=text_positions,
marker={
"color": df["coverage_percent"],
"colorscale": custom_colorscale,
"cmin": 0,
"cmax": 100,
"line": {"color": "white", "width": 0.6},
},
hovertemplate="<b>%{y}</b><br>Coverage: %{x:.1f}%<extra></extra>",
)
)
fig.update_layout(
title={
"text": f"Schema Pattern Coverage for {dataset_name}",
"x": 0.5,
"font": {"size": 18},
},
xaxis={
"title": "Coverage (%)",
"range": [0, 100], # fixed x-axis range
"ticksuffix": "%",
"showgrid": True,
"gridcolor": "rgba(220,220,220,0.3)",
},
yaxis={
"title": "",
"autorange": "reversed",
"automargin": True,
"fixedrange": False, # allow vertical zoom/pan
},
template="plotly_white",
autosize=True, # allow figure to scale with container
height=fig_height, # base height (will scale)
margin={"t": 80, "b": 50, "l": 480, "r": 150}, # extra right margin for text
plot_bgcolor="white",
paper_bgcolor="white",
)
# Disable horizontal zoom/pan
fig.update_xaxes(fixedrange=True)
# Show figure with config for HTML export compatibility
fig.show(
config={
"scrollZoom": True,
"responsive": True,
"toImageButtonOptions": {
"format": "png",
"filename": f"{dataset_name}_schema_coverage",
"height": fig_height,
"width": 600,
"scale": 1,
},
}
)
else:
display(Markdown("**No coverage data to visualize**"))
LinkML (derived from JSON-LD)¶
In [12]:
# Generate LinkML directly from JSON-LD with custom schema URI
print("Regenerating LinkML schema from JSON-LD with custom schema URI...")
schema_name = f"{dataset_name}_schema"
custom_schema_uri = (
f"http://jmillanacosta.github.io/rdfsolve/{dataset_name}/linkml" # User-definable base URI
)
yaml_text = vp.to_linkml_yaml(
schema_name=schema_name,
schema_description=f"LinkML schema for {dataset_name} generated from JSON-LD",
schema_base_uri=custom_schema_uri,
filter_void_nodes=True,
)
# Save to LinkML YAML
linkml_file = os.path.join(exports_path, f"{dataset_name}_linkml_schema.yaml")
with open(linkml_file, "w", encoding="utf-8") as f:
f.write(yaml_text)
print(f"LinkML YAML saved to: {linkml_file}")
Regenerating LinkML schema from JSON-LD with custom schema URI...
LinkML YAML saved to: /home/runner/work/rdfsolve/rdfsolve/notebooks/01_schema_extraction/../../docs/data/schema_extraction/pubchem.compound/pubchem.compound_linkml_schema.yaml
In [13]:
from linkml.generators.erdiagramgen import ERDiagramGenerator
from linkml_runtime.utils.schemaview import SchemaView
sv = SchemaView(linkml_file)
linkml_schema = sv.schema
display(
Markdown(
f"**Parsed LinkML schema:** Classes = {len(sv.all_classes())}, Slots = {len(sv.all_slots())}"
)
)
# Build and display a Mermaid class diagram for the aopwikirdf LinkedML
mermaid_code = ERDiagramGenerator(linkml_file).serialize()
display(Markdown(mermaid_code))
Parsed LinkML schema: Classes = 145, Slots = 1
erDiagram
Chebi113553 {
}
Chebi131344 {
}
Chebi132359 {
}
Chebi132932 {
}
Chebi133455 {
}
Chebi136855 {
}
Chebi145334 {
}
Chebi145791 {
}
Chebi15377 {
}
Chebi160394 {
}
Chebi166530 {
}
Chebi170120 {
}
Chebi176724 {
}
Chebi178092 {
}
Chebi179801 {
}
Chebi179945 {
}
Chebi182413 {
}
Chebi18377 {
}
Chebi18419 {
}
Chebi18422 {
}
Chebi187328 {
}
Chebi192407 {
}
Chebi192730 {
}
Chebi194815 {
}
Chebi195509 {
}
Chebi195930 {
}
Chebi230581 {
}
Chebi234135 {
}
Chebi28182 {
}
Chebi28997 {
}
Chebi29081 {
}
Chebi29236 {
}
Chebi32497 {
}
Chebi33149 {
}
Chebi33493 {
}
Chebi35616 {
}
Chebi36412 {
}
Chebi36769 {
}
Chebi36794 {
}
Chebi36932 {
}
Chebi37368 {
}
Chebi37973 {
}
Chebi38572 {
}
Chebi38685 {
}
Chebi39187 {
}
Chebi39427 {
}
Chebi4027 {
}
Chebi42101 {
}
Chebi4762 {
}
Chebi49664 {
}
Chebi51434 {
}
Chebi5435 {
}
Chebi59349 {
}
Chebi64629 {
}
Chebi67187 {
}
Chebi6802 {
}
Chebi74741 {
}
Chebi7551 {
}
Chebi76211 {
}
Chebi81981 {
}
Chebi83464 {
}
Chebi83467 {
}
Chebi8354 {
}
Chebi8481 {
}
Chebi85057 {
}
Chebi87594 {
}
Chebi87686 {
}
Chebi88370 {
}
Chebi88432 {
}
Chebi88811 {
}
Chebi89327 {
}
Chebi92422 {
}
Chebi92905 {
}
Chebi93191 {
}
Chebi93856 {
}
Chebi95131 {
}
NcitC118885 {
}
NcitC1318 {
}
NcitC197 {
}
NcitC26563 {
}
NcitC28945 {
}
NcitC29290 {
}
NcitC29792 {
}
NcitC44390 {
}
NcitC47397 {
}
NcitC47538 {
}
NcitC61925 {
}
NcitC629 {
}
NcitC65269 {
}
NcitC65379 {
}
NcitC65479 {
}
NcitC68274 {
}
NcitC68395 {
}
NcitC72106 {
}
NcitC73045 {
}
NcitC733 {
}
NcitC740 {
}
NcitC76663 {
}
NcitC80813 {
}
NcitC83634 {
}
NcitC84106 {
}
NcitC84867 {
}
NcitC87413 {
}
NcitC92219 {
}
NcitC97358 {
}
NdfrtN00006003 {
}
NdfrtN00006913 {
}
NdfrtN00007065 {
}
NdfrtN00007181 {
}
NdfrtN00007276 {
}
NdfrtN00007302 {
}
NdfrtN00007443 {
}
NdfrtN00166218 {
}
NdfrtN00166488 {
}
NdfrtN00166729 {
}
NdfrtN00167030 {
}
NdfrtN00167185 {
}
NdfrtN00171426 {
}
NdfrtN00178951 {
}
NdfrtN00179248 {
}
NdfrtN00179765 {
}
Sio010004 {
}
Snomedct115458002 {
}
Snomedct11713004 {
}
Snomedct14125007 {
}
Snomedct21540002 {
}
Snomedct228101007 {
}
Snomedct23079006 {
}
Snomedct23398000 {
}
Snomedct25494000 {
}
Snomedct259136004 {
}
Snomedct319542009 {
}
Snomedct321187004 {
}
Snomedct37566001 {
}
Snomedct411089001 {
}
Snomedct416859008 {
}
Snomedct419594005 {
}
Snomedct442753007 {
}
Snomedct54378000 {
}
Snomedct6910009 {
}
Snomedct69236009 {
}
Snomedct74583006 {
}
Snomedct77431009 {
}
Snomedct87567009 {
}
VocabularyCompound {
}
Chebi113553 ||--|o NcitC197 : "cheminf_000455"
Chebi131344 ||--|o NcitC197 : "cheminf_000455"
Chebi132359 ||--|o NcitC197 : "cheminf_000455"
Chebi132932 ||--|o NcitC197 : "cheminf_000455"
Chebi133455 ||--|o NcitC197 : "cheminf_000455"
Chebi136855 ||--|o NcitC197 : "cheminf_000455"
Chebi145334 ||--|o NcitC197 : "cheminf_000455"
Chebi145791 ||--|o NcitC197 : "cheminf_000455"
Chebi15377 ||--|o NcitC197 : "cheminf_000455"
Chebi160394 ||--|o NcitC197 : "cheminf_000455"
Chebi166530 ||--|o NcitC197 : "cheminf_000455"
Chebi170120 ||--|o NcitC197 : "cheminf_000455"
Chebi176724 ||--|o NcitC197 : "cheminf_000455"
Chebi178092 ||--|o NcitC197 : "cheminf_000455"
Chebi179801 ||--|o NcitC197 : "cheminf_000455"
Chebi179945 ||--|o NcitC197 : "cheminf_000455"
Chebi182413 ||--|o NcitC197 : "cheminf_000455"
Chebi18377 ||--|o NcitC197 : "cheminf_000455"
Chebi18419 ||--|o NcitC197 : "cheminf_000455"
Chebi18422 ||--|o NcitC197 : "cheminf_000455"
Chebi187328 ||--|o NcitC197 : "cheminf_000455"
Chebi192407 ||--|o NcitC197 : "cheminf_000455"
Chebi192730 ||--|o NcitC197 : "cheminf_000455"
Chebi194815 ||--|o NcitC197 : "cheminf_000455"
Chebi195509 ||--|o NcitC197 : "cheminf_000455"
Chebi195930 ||--|o NcitC197 : "cheminf_000455"
Chebi230581 ||--|o NcitC197 : "cheminf_000455"
Chebi234135 ||--|o NcitC197 : "cheminf_000455"
Chebi28182 ||--|o NcitC197 : "cheminf_000455"
Chebi28997 ||--|o NcitC197 : "cheminf_000455"
Chebi29081 ||--|o NcitC197 : "cheminf_000455"
Chebi29236 ||--|o NcitC197 : "cheminf_000455"
Chebi32497 ||--|o NcitC197 : "cheminf_000455"
Chebi33149 ||--|o NcitC197 : "cheminf_000455"
Chebi33493 ||--|o NcitC197 : "cheminf_000455"
Chebi35616 ||--|o NcitC197 : "cheminf_000455"
Chebi36412 ||--|o NcitC197 : "cheminf_000455"
Chebi36769 ||--|o NcitC197 : "cheminf_000455"
Chebi36794 ||--|o NcitC197 : "cheminf_000455"
Chebi36932 ||--|o NcitC197 : "cheminf_000455"
Chebi37368 ||--|o NcitC197 : "cheminf_000455"
Chebi37973 ||--|o NcitC197 : "cheminf_000455"
Chebi38572 ||--|o NcitC197 : "cheminf_000455"
Chebi38685 ||--|o NcitC197 : "cheminf_000455"
Chebi39187 ||--|o NcitC197 : "cheminf_000455"
Chebi39427 ||--|o NcitC197 : "cheminf_000455"
Chebi4027 ||--|o NcitC197 : "cheminf_000455"
Chebi42101 ||--|o NcitC197 : "cheminf_000455"
Chebi4762 ||--|o NcitC197 : "cheminf_000455"
Chebi49664 ||--|o NcitC197 : "cheminf_000455"
Chebi51434 ||--|o NcitC197 : "cheminf_000455"
Chebi5435 ||--|o NcitC197 : "cheminf_000455"
Chebi59349 ||--|o NcitC197 : "cheminf_000455"
Chebi64629 ||--|o NcitC197 : "cheminf_000455"
Chebi67187 ||--|o NcitC197 : "cheminf_000455"
Chebi6802 ||--|o NcitC197 : "cheminf_000455"
Chebi74741 ||--|o NcitC197 : "cheminf_000455"
Chebi7551 ||--|o NcitC197 : "cheminf_000455"
Chebi76211 ||--|o NcitC197 : "cheminf_000455"
Chebi81981 ||--|o NcitC197 : "cheminf_000455"
Chebi83464 ||--|o NcitC197 : "cheminf_000455"
Chebi83467 ||--|o NcitC197 : "cheminf_000455"
Chebi8354 ||--|o NcitC197 : "cheminf_000455"
Chebi8481 ||--|o NcitC197 : "cheminf_000455"
Chebi85057 ||--|o NcitC197 : "cheminf_000455"
Chebi87594 ||--|o NcitC197 : "cheminf_000455"
Chebi87686 ||--|o NcitC197 : "cheminf_000455"
Chebi88370 ||--|o NcitC197 : "cheminf_000455"
Chebi88432 ||--|o NcitC197 : "cheminf_000455"
Chebi88811 ||--|o NcitC197 : "cheminf_000455"
Chebi89327 ||--|o NcitC197 : "cheminf_000455"
Chebi92422 ||--|o NcitC197 : "cheminf_000455"
Chebi92905 ||--|o NcitC197 : "cheminf_000455"
Chebi93191 ||--|o NcitC197 : "cheminf_000455"
Chebi93856 ||--|o NcitC197 : "cheminf_000455"
Chebi95131 ||--|o NcitC197 : "cheminf_000455"
NcitC118885 ||--|o NcitC197 : "cheminf_000455"
NcitC1318 ||--|o NcitC197 : "cheminf_000455"
NcitC26563 ||--|o NcitC197 : "cheminf_000455"
NcitC28945 ||--|o NcitC197 : "cheminf_000455"
NcitC29290 ||--|o NcitC197 : "cheminf_000455"
NcitC29792 ||--|o NcitC197 : "cheminf_000455"
NcitC44390 ||--|o NcitC197 : "cheminf_000455"
NcitC47397 ||--|o NcitC197 : "cheminf_000455"
NcitC47538 ||--|o NcitC197 : "cheminf_000455"
NcitC61925 ||--|o NcitC197 : "cheminf_000455"
NcitC629 ||--|o NcitC197 : "cheminf_000455"
NcitC65269 ||--|o NcitC197 : "cheminf_000455"
NcitC65379 ||--|o NcitC197 : "cheminf_000455"
NcitC65479 ||--|o NcitC197 : "cheminf_000455"
NcitC68274 ||--|o NcitC197 : "cheminf_000455"
NcitC68395 ||--|o NcitC197 : "cheminf_000455"
NcitC72106 ||--|o NcitC197 : "cheminf_000455"
NcitC73045 ||--|o NcitC197 : "cheminf_000455"
NcitC733 ||--|o NcitC197 : "cheminf_000455"
NcitC740 ||--|o NcitC197 : "cheminf_000455"
NcitC76663 ||--|o NcitC197 : "cheminf_000455"
NcitC80813 ||--|o NcitC197 : "cheminf_000455"
NcitC83634 ||--|o NcitC197 : "cheminf_000455"
NcitC84106 ||--|o NcitC197 : "cheminf_000455"
NcitC84867 ||--|o NcitC197 : "cheminf_000455"
NcitC87413 ||--|o NcitC197 : "cheminf_000455"
NcitC92219 ||--|o NcitC197 : "cheminf_000455"
NcitC97358 ||--|o NcitC197 : "cheminf_000455"
NdfrtN00006003 ||--|o NcitC197 : "cheminf_000455"
NdfrtN00006913 ||--|o NcitC197 : "cheminf_000455"
NdfrtN00007065 ||--|o NcitC197 : "cheminf_000455"
NdfrtN00007181 ||--|o NcitC197 : "cheminf_000455"
NdfrtN00007276 ||--|o NcitC197 : "cheminf_000455"
NdfrtN00007302 ||--|o NcitC197 : "cheminf_000455"
NdfrtN00007443 ||--|o NcitC197 : "cheminf_000455"
NdfrtN00166218 ||--|o NcitC197 : "cheminf_000455"
NdfrtN00166488 ||--|o NcitC197 : "cheminf_000455"
NdfrtN00166729 ||--|o NcitC197 : "cheminf_000455"
NdfrtN00167030 ||--|o NcitC197 : "cheminf_000455"
NdfrtN00167185 ||--|o NcitC197 : "cheminf_000455"
NdfrtN00171426 ||--|o NcitC197 : "cheminf_000455"
NdfrtN00178951 ||--|o NcitC197 : "cheminf_000455"
NdfrtN00179248 ||--|o NcitC197 : "cheminf_000455"
NdfrtN00179765 ||--|o NcitC197 : "cheminf_000455"
Sio010004 ||--|o NcitC197 : "cheminf_000455"
Snomedct115458002 ||--|o NcitC197 : "cheminf_000455"
Snomedct11713004 ||--|o NcitC197 : "cheminf_000455"
Snomedct14125007 ||--|o NcitC197 : "cheminf_000455"
Snomedct21540002 ||--|o NcitC197 : "cheminf_000455"
Snomedct228101007 ||--|o NcitC197 : "cheminf_000455"
Snomedct23079006 ||--|o NcitC197 : "cheminf_000455"
Snomedct23398000 ||--|o NcitC197 : "cheminf_000455"
Snomedct25494000 ||--|o NcitC197 : "cheminf_000455"
Snomedct259136004 ||--|o NcitC197 : "cheminf_000455"
Snomedct319542009 ||--|o NcitC197 : "cheminf_000455"
Snomedct321187004 ||--|o NcitC197 : "cheminf_000455"
Snomedct37566001 ||--|o NcitC197 : "cheminf_000455"
Snomedct411089001 ||--|o NcitC197 : "cheminf_000455"
Snomedct416859008 ||--|o NcitC197 : "cheminf_000455"
Snomedct419594005 ||--|o NcitC197 : "cheminf_000455"
Snomedct442753007 ||--|o NcitC197 : "cheminf_000455"
Snomedct54378000 ||--|o NcitC197 : "cheminf_000455"
Snomedct6910009 ||--|o NcitC197 : "cheminf_000455"
Snomedct69236009 ||--|o NcitC197 : "cheminf_000455"
Snomedct74583006 ||--|o NcitC197 : "cheminf_000455"
Snomedct77431009 ||--|o NcitC197 : "cheminf_000455"
Snomedct87567009 ||--|o NcitC197 : "cheminf_000455"
VocabularyCompound ||--|o NcitC197 : "cheminf_000455"
In [14]:
json_path = os.path.join(exports_path, f"{dataset_name}_schema.json")
csv_path = os.path.join(exports_path, f"{dataset_name}_schema.csv")
# Export CSV from frequencies
frequencies_df.to_csv(csv_path, index=False)
# Export JSON derived from JSON-LD (maintains consistency)
with open(json_path, "w", encoding="utf-8") as fh:
json.dump(vp.to_json(filter_void_nodes=True), fh, indent=2)
print(f"CSV exported to: {csv_path}")
print(f"JSON exported to: {json_path}")
CSV exported to: /home/runner/work/rdfsolve/rdfsolve/notebooks/01_schema_extraction/../../docs/data/schema_extraction/pubchem.compound/pubchem.compound_schema.csv JSON exported to: /home/runner/work/rdfsolve/rdfsolve/notebooks/01_schema_extraction/../../docs/data/schema_extraction/pubchem.compound/pubchem.compound_schema.json
In [15]:
# Export collected SPARQL queries as TTL
queries_path = os.path.join(exports_path, f"{dataset_name}_sparql_queries.ttl")
queries = SparqlHelper.get_collected_queries()
if queries:
ttl_content = SparqlHelper.export_queries_as_ttl(
output_file=queries_path,
base_uri=f"https://github.com/jmillanacosta/rdfsolve/sparql/{dataset_name}/",
dataset_name=dataset_name,
)
print(f"Exported {len(queries)} SPARQL queries to: {queries_path}")
else:
print("No SPARQL queries were collected")
No SPARQL queries were collected